{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "import numpy as np\n",
    "from scipy.sparse import csr_matrix, random\n",
    "from sklearn.decomposition import TruncatedSVD\n",
    "\n",
    "# Reproducibility: a single seed drives every random step in this cell\n",
    "SEED = 42\n",
    "rng = np.random.default_rng(SEED)\n",
    "\n",
    "# Corpus dimensions: 100 documents of 1000 words each, drawn from a 10000-word vocabulary\n",
    "vocab_size = 10000\n",
    "num_docs = 100\n",
    "doc_len = 1000\n",
    "\n",
    "# Vocabulary labels; column j of a document vector counts occurrences of vocab[j]\n",
    "vocab = [f'word{i}' for i in range(vocab_size)]\n",
    "\n",
    "\n",
    "def sample_count_matrix(rng, num_docs, vocab_size, doc_len):\n",
    "    \"\"\"Return a (num_docs, vocab_size) float array of word counts.\n",
    "\n",
    "    Each row is one document: doc_len word indices are drawn uniformly with\n",
    "    replacement from range(vocab_size) and tallied per column, so every row\n",
    "    sums to doc_len.\n",
    "    \"\"\"\n",
    "    word_indices = rng.integers(vocab_size, size=(num_docs, doc_len))\n",
    "    doc_indices = np.repeat(np.arange(num_docs), doc_len)\n",
    "    counts = np.zeros((num_docs, vocab_size))\n",
    "    # np.add.at accumulates repeated (doc, word) pairs; a plain fancy-indexed\n",
    "    # `+=` would count a word that occurs several times in a document only once\n",
    "    np.add.at(counts, (doc_indices, word_indices.ravel()), 1)\n",
    "    return counts\n",
    "\n",
    "\n",
    "# Dense corpus: word-count vectors held in a regular NumPy array\n",
    "dense_vectors = sample_count_matrix(rng, num_docs, vocab_size, doc_len)\n",
    "assert (dense_vectors.sum(axis=1) == doc_len).all()\n",
    "\n",
    "# Sparse corpus: a second, independent sample of documents stored in CSR format.\n",
    "# The word counts are laid over a 1%-density base of uniform [0, 1) random values.\n",
    "# The matrix is assembled with one sparse addition because changing the sparsity\n",
    "# structure of a CSR matrix entry by entry is expensive.\n",
    "sparse_base = random(num_docs, vocab_size, density=0.01, format='csr', random_state=SEED)\n",
    "sparse_counts = csr_matrix(sample_count_matrix(rng, num_docs, vocab_size, doc_len))\n",
    "sparse_vectors = sparse_base + sparse_counts\n",
    "assert sparse_vectors.shape == dense_vectors.shape\n",
    "\n",
    "# Fit TruncatedSVD on the dense corpus and project it onto 2 components\n",
    "svd = TruncatedSVD(n_components=2, random_state=SEED)\n",
    "dense_vectors_svd = svd.fit_transform(dense_vectors)\n",
    "\n",
    "# Project the sparse corpus with the components fitted on the dense corpus\n",
    "sparse_vectors_svd = svd.transform(sparse_vectors)\n",
    "\n",
    "# Plot both 2D projections on one scatter plot\n",
    "fig, ax = plt.subplots(figsize=(10, 8))\n",
    "ax.scatter(dense_vectors_svd[:, 0], dense_vectors_svd[:, 1], c='b', label='Dense vectors')\n",
    "ax.scatter(sparse_vectors_svd[:, 0], sparse_vectors_svd[:, 1], c='r', label='Sparse vectors')\n",
    "ax.set_title('2D embeddings of dense and sparse document vectors after TruncatedSVD dimensionality reduction')\n",
    "ax.set_xlabel('Dimension 1')\n",
    "ax.set_ylabel('Dimension 2')\n",
    "ax.legend()\n",
    "plt.show()\n"
   ]
  }
 ],
 "metadata": {
  "language_info": {
   "name": "python"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
